# Importing required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Importing the data
df = pd.read_csv(r"C:\Users\shinu\Downloads\Rainfall Prediction\weatherAUS.csv")
df
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RISK_MM | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | NaN | NaN | W | 44.0 | W | ... | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | No | 0.0 | No |
| 1 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | ... | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No | 0.0 | No |
| 2 | 2008-12-03 | Albury | 12.9 | 25.7 | 0.0 | NaN | NaN | WSW | 46.0 | W | ... | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No | 0.0 | No |
| 3 | 2008-12-04 | Albury | 9.2 | 28.0 | 0.0 | NaN | NaN | NE | 24.0 | SE | ... | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No | 1.0 | No |
| 4 | 2008-12-05 | Albury | 17.5 | 32.3 | 1.0 | NaN | NaN | W | 41.0 | ENE | ... | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 | No | 0.2 | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 142188 | 2017-06-20 | Uluru | 3.5 | 21.8 | 0.0 | NaN | NaN | E | 31.0 | ESE | ... | 27.0 | 1024.7 | 1021.2 | NaN | NaN | 9.4 | 20.9 | No | 0.0 | No |
| 142189 | 2017-06-21 | Uluru | 2.8 | 23.4 | 0.0 | NaN | NaN | E | 31.0 | SE | ... | 24.0 | 1024.6 | 1020.3 | NaN | NaN | 10.1 | 22.4 | No | 0.0 | No |
| 142190 | 2017-06-22 | Uluru | 3.6 | 25.3 | 0.0 | NaN | NaN | NNW | 22.0 | SE | ... | 21.0 | 1023.5 | 1019.1 | NaN | NaN | 10.9 | 24.5 | No | 0.0 | No |
| 142191 | 2017-06-23 | Uluru | 5.4 | 26.9 | 0.0 | NaN | NaN | N | 37.0 | SE | ... | 24.0 | 1021.0 | 1016.8 | NaN | NaN | 12.5 | 26.1 | No | 0.0 | No |
| 142192 | 2017-06-24 | Uluru | 7.8 | 27.0 | 0.0 | NaN | NaN | SE | 28.0 | SSE | ... | 24.0 | 1019.4 | 1016.5 | 3.0 | 2.0 | 15.1 | 26.0 | No | 0.0 | No |
142193 rows × 24 columns
df.head()
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RISK_MM | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | NaN | NaN | W | 44.0 | W | ... | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | No | 0.0 | No |
| 1 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | ... | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No | 0.0 | No |
| 2 | 2008-12-03 | Albury | 12.9 | 25.7 | 0.0 | NaN | NaN | WSW | 46.0 | W | ... | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No | 0.0 | No |
| 3 | 2008-12-04 | Albury | 9.2 | 28.0 | 0.0 | NaN | NaN | NE | 24.0 | SE | ... | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No | 1.0 | No |
| 4 | 2008-12-05 | Albury | 17.5 | 32.3 | 1.0 | NaN | NaN | W | 41.0 | ENE | ... | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 | No | 0.2 | No |
5 rows × 24 columns
df.tail()
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RISK_MM | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 142188 | 2017-06-20 | Uluru | 3.5 | 21.8 | 0.0 | NaN | NaN | E | 31.0 | ESE | ... | 27.0 | 1024.7 | 1021.2 | NaN | NaN | 9.4 | 20.9 | No | 0.0 | No |
| 142189 | 2017-06-21 | Uluru | 2.8 | 23.4 | 0.0 | NaN | NaN | E | 31.0 | SE | ... | 24.0 | 1024.6 | 1020.3 | NaN | NaN | 10.1 | 22.4 | No | 0.0 | No |
| 142190 | 2017-06-22 | Uluru | 3.6 | 25.3 | 0.0 | NaN | NaN | NNW | 22.0 | SE | ... | 21.0 | 1023.5 | 1019.1 | NaN | NaN | 10.9 | 24.5 | No | 0.0 | No |
| 142191 | 2017-06-23 | Uluru | 5.4 | 26.9 | 0.0 | NaN | NaN | N | 37.0 | SE | ... | 24.0 | 1021.0 | 1016.8 | NaN | NaN | 12.5 | 26.1 | No | 0.0 | No |
| 142192 | 2017-06-24 | Uluru | 7.8 | 27.0 | 0.0 | NaN | NaN | SE | 28.0 | SSE | ... | 24.0 | 1019.4 | 1016.5 | 3.0 | 2.0 | 15.1 | 26.0 | No | 0.0 | No |
5 rows × 24 columns
df['WindGustDir'].value_counts().plot(kind="pie",autopct="%.2f%%",figsize=(10,7))
plt.show()
df['WindDir9am'].value_counts().plot(kind="pie",autopct="%.2f%%",figsize=(10,7))
plt.show()
df['WindDir3pm'].value_counts().plot(kind="pie",autopct="%.2f%%",figsize=(10,7))
plt.show()
df['Humidity3pm'].value_counts().plot(kind="bar",figsize=(20,15))
plt.show()
df['RainTomorrow'].value_counts().plot(kind="pie")
plt.show()
ax=pd.crosstab(df['WindGustDir'],df['WindDir9am']).plot(kind="bar",stacked=True,figsize = (25,15))
for i in ax.containers:
ax.bar_label(i)
ax=pd.crosstab(df['WindDir9am'],df['WindDir3pm']).plot(kind="bar",stacked=True,figsize = (25,15))
for i in ax.containers:
ax.bar_label(i)
ax=pd.crosstab(df['RainToday'],df['RainTomorrow']).plot(kind="bar",stacked=True)
for i in ax.containers:
ax.bar_label(i)
# Pairplot for whole data tells you the distribution for single column and that single column with all other different columns
sns.pairplot(df,diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x1e50240b7d0>
# Finding the distribution for each single columnb
df.hist(figsize=(20,15))
plt.show()
# Checking Correlation between two variabless using Heatmap
k=df.select_dtypes(include=['int','float'])
cor=k.corr()
plt.figure(figsize=(20,15))
sns.heatmap(cor,annot=True)
plt.show()
# sorting the data based on date (Time based splitting)
df = df.sort_values(by='Date')
#After sorting the data index's got changed to get that in order we doing reset index
df.reset_index(inplace=True)
df.drop("index",axis = 1,inplace=True)
#Removing unwanted features, RISK_MM is same as target label hence removing with date and loaction
df.drop(['Date', 'Location','RISK_MM'], axis=1,inplace = True)
df.head()
| MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | WindDir3pm | WindSpeed9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8.0 | 24.3 | 0.0 | 3.4 | 6.3 | NW | 30.0 | SW | NW | 6.0 | ... | 68.0 | 29.0 | 1019.7 | 1015.0 | 7.0 | 7.0 | 14.4 | 23.6 | No | Yes |
| 1 | 14.0 | 26.9 | 3.6 | 4.4 | 9.7 | ENE | 39.0 | E | W | 4.0 | ... | 80.0 | 36.0 | 1012.4 | 1008.4 | 5.0 | 3.0 | 17.5 | 25.7 | Yes | Yes |
| 2 | 13.7 | 23.4 | 3.6 | 5.8 | 3.3 | NW | 85.0 | N | NNE | 6.0 | ... | 82.0 | 69.0 | 1009.5 | 1007.2 | 8.0 | 7.0 | 15.4 | 20.2 | Yes | Yes |
| 3 | 13.3 | 15.5 | 39.8 | 7.2 | 9.1 | NW | 54.0 | WNW | W | 30.0 | ... | 62.0 | 56.0 | 1005.5 | 1007.0 | 2.0 | 7.0 | 13.5 | 14.1 | Yes | Yes |
| 4 | 7.6 | 16.1 | 2.8 | 5.6 | 10.6 | SSE | 50.0 | SSE | ESE | 20.0 | ... | 68.0 | 49.0 | 1018.3 | 1018.5 | 7.0 | 7.0 | 11.1 | 15.4 | Yes | No |
5 rows × 21 columns
df.dtypes
MinTemp float64 MaxTemp float64 Rainfall float64 Evaporation float64 Sunshine float64 WindGustDir object WindGustSpeed float64 WindDir9am object WindDir3pm object WindSpeed9am float64 WindSpeed3pm float64 Humidity9am float64 Humidity3pm float64 Pressure9am float64 Pressure3pm float64 Cloud9am float64 Cloud3pm float64 Temp9am float64 Temp3pm float64 RainToday object RainTomorrow object dtype: object
# Both variables categorical - chi square test of independence
pd.crosstab(df['WindGustDir'],df['WindDir9am'])
| WindDir9am | E | ENE | ESE | N | NE | NNE | NNW | NW | S | SE | SSE | SSW | SW | W | WNW | WSW |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| WindGustDir | ||||||||||||||||
| E | 2251 | 1169 | 1353 | 168 | 424 | 219 | 136 | 196 | 312 | 801 | 517 | 257 | 308 | 134 | 218 | 164 |
| ENE | 1225 | 1560 | 699 | 256 | 808 | 383 | 166 | 195 | 281 | 535 | 413 | 230 | 248 | 201 | 172 | 182 |
| ESE | 1000 | 429 | 1478 | 157 | 233 | 170 | 104 | 133 | 388 | 1185 | 707 | 267 | 240 | 146 | 130 | 137 |
| N | 310 | 431 | 177 | 2966 | 785 | 1441 | 677 | 420 | 95 | 180 | 125 | 102 | 126 | 198 | 223 | 125 |
| NE | 573 | 918 | 344 | 472 | 1255 | 834 | 285 | 322 | 180 | 368 | 228 | 152 | 173 | 153 | 182 | 128 |
| NNE | 321 | 435 | 204 | 961 | 900 | 1433 | 354 | 340 | 105 | 200 | 138 | 103 | 100 | 154 | 143 | 113 |
| NNW | 229 | 257 | 136 | 1422 | 381 | 570 | 1329 | 719 | 99 | 135 | 114 | 69 | 87 | 151 | 256 | 90 |
| NW | 245 | 262 | 139 | 1196 | 343 | 520 | 1385 | 1484 | 120 | 151 | 132 | 107 | 135 | 349 | 710 | 179 |
| S | 243 | 197 | 326 | 232 | 160 | 185 | 178 | 239 | 1894 | 593 | 1120 | 1268 | 850 | 308 | 255 | 475 |
| SE | 663 | 350 | 1042 | 170 | 252 | 153 | 178 | 161 | 927 | 2014 | 1761 | 419 | 311 | 162 | 162 | 161 |
| SSE | 361 | 204 | 548 | 189 | 142 | 128 | 120 | 136 | 1409 | 1338 | 2093 | 721 | 465 | 262 | 215 | 279 |
| SSW | 304 | 260 | 247 | 282 | 228 | 233 | 204 | 224 | 1112 | 318 | 542 | 1629 | 1247 | 404 | 306 | 629 |
| SW | 296 | 291 | 232 | 383 | 253 | 267 | 280 | 289 | 609 | 277 | 348 | 973 | 1683 | 695 | 342 | 1106 |
| W | 242 | 260 | 113 | 756 | 297 | 378 | 725 | 910 | 163 | 148 | 162 | 237 | 527 | 2051 | 1349 | 966 |
| WNW | 174 | 207 | 110 | 797 | 274 | 450 | 925 | 1289 | 121 | 123 | 134 | 128 | 210 | 828 | 1477 | 340 |
| WSW | 224 | 276 | 162 | 528 | 305 | 339 | 424 | 469 | 334 | 164 | 220 | 487 | 1019 | 1305 | 705 | 1456 |
# Null-there is no association between both variables
# Alt-There is association between both variables
from scipy.stats import chi2_contingency
chi2_contingency(pd.crosstab(df['WindGustDir'],df['WindDir9am']))
#Since Pvalue = 0.0 Not Reject Null
Chi2ContingencyResult(statistic=110952.90285769245, pvalue=0.0, dof=225, expected_freq=array([[596.13562527, 516.63710926, 503.14645199, 752.65478147,
484.56238332, 530.1965964 , 514.15923343, 518.01370694,
560.89472466, 587.11891047, 602.53680448, 492.06484067,
531.98617339, 516.29295984, 471.14055594, 449.45914248],
[521.99009079, 452.37935822, 440.56662784, 659.04187078,
424.29398905, 464.25235762, 450.20967304, 453.58473887,
491.13234614, 514.09484753, 527.59511082, 430.8633136 ,
465.81935247, 452.07801305, 412.54152771, 393.55678246],
[477.07434298, 413.45341397, 402.65713511, 602.333211 ,
387.78471014, 424.30477589, 411.47042397, 414.55507508,
448.8718186 , 469.85846272, 482.19706713, 393.78876318,
425.73693533, 413.17799869, 377.04351434, 359.69235188],
[579.13674225, 501.90513651, 488.79916705, 731.19273484,
470.74502545, 515.07797316, 499.49791763, 503.24248033,
544.9007404 , 570.37714021, 585.35539102, 478.03354928,
516.81652013, 501.57080056, 457.70592318, 436.64275798],
[453.78725526, 393.27180903, 383.00252118, 572.931952 ,
368.85605323, 403.5934912 , 391.3856133 , 394.31969554,
426.96136048, 446.92359859, 458.65992756, 374.56703474,
404.95574367, 393.0098374 , 358.63915971, 342.1349471 ],
[414.88330754, 359.5559527 , 350.16706825, 523.81352822,
337.23340088, 368.99273963, 357.83146372, 360.51400214,
390.35724202, 408.60808374, 419.33823741, 342.4547703 ,
370.2382039 , 359.31644035, 327.8924189 , 312.80314031],
[417.64735356, 361.95139543, 352.49996011, 527.3032919 ,
339.48012574, 371.45105235, 360.21541751, 362.9158276 ,
392.95788987, 411.33032281, 422.13196317, 344.7362811 ,
372.70481418, 361.71028738, 330.07691203, 314.88710527],
[515.2872792 , 446.57040961, 434.90936508, 650.57919386,
418.84568128, 458.29094927, 444.42858511, 447.76031212,
484.8257751 , 507.4934178 , 520.82032584, 425.33064992,
459.83782253, 446.27293399, 407.24413187, 388.50316744],
[588.94910562, 510.40895818, 497.08093316, 743.58139591,
478.72089869, 523.80498333, 507.96095358, 511.76896073,
554.13304026, 580.0410889 , 595.27311749, 486.1329126 ,
525.57298664, 510.06895754, 465.4608738 , 444.04083359],
[614.03282325, 532.14760089, 518.25192679, 775.25100129,
499.10992676, 546.11417128, 529.59533422, 533.56552682,
577.73391948, 604.74540842, 620.62617881, 506.83762307,
547.95747499, 531.79311941, 485.28514896, 462.95281559],
[594.96090571, 515.6190461 , 502.15497295, 751.17163191,
483.60752525, 529.1518135 , 513.14605307, 516.99293111,
559.78944933, 585.96195886, 601.34947103, 491.09519858,
530.93786402, 515.27557485, 470.21214636, 448.57345737],
[564.48729835, 489.20929008, 476.43484019, 712.69698735,
458.83738371, 502.04891573, 486.86296255, 490.51280537,
531.1173068 , 555.94927317, 570.54864447, 465.94154207,
503.74348561, 488.88341126, 446.12810959, 425.5977437 ],
[575.19797667, 498.49163063, 485.47479615, 726.2198216 ,
467.54344253, 511.57487753, 496.10078348, 499.81987905,
541.19481721, 566.49794954, 581.37433181, 474.7823964 ,
513.30160047, 498.15956853, 454.59302047, 433.67310792],
[641.53508114, 555.98225598, 541.4642008 , 809.9741499 ,
521.46483908, 570.57438287, 553.31567442, 557.46369018,
603.61036557, 631.83168712, 648.42375018, 529.53865548,
572.50024733, 555.61189743, 507.02085561, 483.68826693],
[524.27042876, 454.35559846, 442.49126362, 661.92092582,
426.14753706, 466.28046562, 452.17643492, 455.56624487,
493.27788061, 516.34069476, 529.89993458, 432.74556001,
467.85430596, 454.05293686, 414.34373454, 395.27605355],
[581.62438367, 504.06103496, 490.89876973, 734.33352216,
472.76707782, 517.29045461, 501.64347604, 505.40412325,
547.24132346, 572.82715537, 587.86974421, 480.086909 ,
519.03646939, 503.72526289, 459.671967 , 438.51832645]]))
pd.crosstab(df['WindDir9am'],df['WindDir3pm'])
| WindDir3pm | E | ENE | ESE | N | NE | NNE | NNW | NW | S | SE | SSE | SSW | SW | W | WNW | WSW |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| WindDir9am | ||||||||||||||||
| E | 1520 | 1049 | 1110 | 412 | 655 | 416 | 362 | 349 | 342 | 752 | 467 | 314 | 348 | 262 | 244 | 308 |
| ENE | 842 | 1110 | 553 | 592 | 818 | 564 | 415 | 325 | 237 | 377 | 274 | 265 | 338 | 338 | 244 | 368 |
| ESE | 1091 | 775 | 1265 | 232 | 417 | 219 | 207 | 178 | 365 | 1066 | 610 | 249 | 297 | 172 | 147 | 207 |
| N | 247 | 292 | 188 | 2224 | 698 | 905 | 1687 | 1363 | 250 | 219 | 199 | 255 | 408 | 839 | 955 | 524 |
| NE | 379 | 628 | 264 | 890 | 1045 | 897 | 565 | 467 | 187 | 277 | 170 | 194 | 284 | 364 | 380 | 355 |
| NNE | 233 | 353 | 190 | 1522 | 831 | 1097 | 791 | 624 | 174 | 148 | 136 | 175 | 242 | 506 | 470 | 387 |
| NNW | 182 | 187 | 211 | 597 | 396 | 379 | 1150 | 1259 | 181 | 273 | 121 | 177 | 300 | 827 | 1068 | 450 |
| NW | 244 | 226 | 222 | 410 | 463 | 377 | 671 | 1356 | 199 | 292 | 166 | 207 | 355 | 1070 | 1347 | 596 |
| S | 370 | 314 | 515 | 104 | 249 | 119 | 133 | 141 | 1753 | 969 | 1458 | 922 | 651 | 187 | 161 | 345 |
| SE | 769 | 667 | 1169 | 218 | 458 | 243 | 242 | 185 | 694 | 1934 | 1159 | 376 | 306 | 166 | 182 | 189 |
| SSE | 559 | 469 | 882 | 171 | 303 | 149 | 179 | 176 | 1097 | 1695 | 1674 | 535 | 447 | 170 | 145 | 245 |
| SSW | 286 | 278 | 381 | 95 | 211 | 100 | 84 | 120 | 1357 | 523 | 780 | 1327 | 982 | 204 | 141 | 449 |
| SW | 310 | 292 | 327 | 125 | 263 | 142 | 103 | 158 | 1071 | 522 | 617 | 1195 | 1387 | 471 | 194 | 885 |
| W | 388 | 245 | 289 | 177 | 248 | 129 | 168 | 327 | 438 | 256 | 328 | 491 | 792 | 1696 | 817 | 1358 |
| WNW | 327 | 214 | 237 | 186 | 259 | 132 | 229 | 597 | 266 | 244 | 224 | 260 | 425 | 1298 | 1366 | 868 |
| WSW | 208 | 213 | 228 | 112 | 167 | 106 | 101 | 164 | 563 | 285 | 348 | 655 | 1127 | 835 | 323 | 1331 |
chi2_contingency(pd.crosstab(df['WindDir9am'],df['WindDir3pm']))
# Since pvalue=0.0 , Not Reject Null
Chi2ContingencyResult(statistic=79917.66217942107, pvalue=0.0, dof=225, expected_freq=array([[544.49885921, 500.48719781, 549.70085963, 552.16496508,
512.05480399, 408.90461156, 485.08653868, 533.13659515,
627.93620797, 672.97457998, 597.61402134, 519.99469936,
594.73923164, 643.74755134, 560.17330783, 606.78596944],
[468.11013036, 430.27294447, 472.58233274, 474.70074439,
440.21771028, 351.53864473, 417.03287164, 458.34189886,
539.8419027 , 578.56176012, 513.77367042, 447.04370338,
511.30219016, 553.43504413, 481.58558226, 521.65886935],
[458.14904012, 421.11700583, 462.52607684, 464.59941002,
430.85015326, 344.05812265, 408.15867346, 448.58867046,
528.35440529, 566.25032841, 502.84088866, 437.5308935 ,
500.42199995, 541.65829319, 471.33774285, 510.5582955 ],
[687.68189256, 632.09679427, 694.25182642, 697.36390035,
646.70625245, 516.43137978, 612.64633219, 673.33177387,
793.06017377, 849.94196953, 754.7643751 , 656.73404623,
751.13362218, 813.02931483, 707.47814063, 766.34820585],
[448.92128168, 412.63512403, 453.21015879, 455.24173216,
422.17223234, 337.12831386, 399.93779048, 439.55347115,
517.71261321, 554.84525977, 492.71297427, 428.71841319,
490.34280534, 530.74854232, 461.8443456 , 500.27494181],
[481.49343566, 442.57448165, 486.09349865, 488.27247586,
452.8035691 , 361.58916212, 428.95587411, 471.44592965,
555.27602498, 595.10288616, 528.4624999 , 459.82471788,
525.92035983, 569.2577954 , 495.35415178, 536.57313729],
[474.09900671, 435.77774193, 478.62842525, 480.7739393 ,
445.84973843, 356.03613653, 422.36827914, 464.20580305,
546.74849623, 585.9637252 , 520.34675394, 452.76306146,
517.84365421, 560.5155447 , 487.74685995, 528.332834 ],
[501.17117221, 460.66167331, 505.95923118, 508.22725911,
471.30880444, 376.36663517, 446.48649874, 490.71304341,
577.96911802, 619.42362856, 550.0597743 , 478.61689444,
547.41374171, 592.52229725, 515.59834989, 558.50187827],
[512.78225899, 471.33423982, 517.68124726, 520.00182065,
482.22804268, 385.08626213, 456.83065613, 502.08183725,
591.35945242, 633.77437717, 562.80350764, 489.70544583,
560.09617202, 606.24979835, 527.54368417, 571.44119748],
[547.37107542, 503.12725373, 552.60051624, 555.07761978,
514.75587872, 411.06157191, 487.64535656, 535.94887573,
631.24855385, 676.52450201, 600.76641854, 522.73765681,
597.8764644 , 647.14330161, 563.12820631, 609.9867484 ],
[543.64330545, 499.70079817, 548.83713212, 551.29736581,
511.25022854, 408.26211273, 484.32433761, 532.29889455,
626.94955175, 671.9171564 , 596.67500941, 519.17764821,
593.80473677, 642.73605125, 559.2931253 , 605.83254592],
[447.21017415, 411.06232475, 451.48270379, 453.50653361,
420.56308144, 335.8433162 , 398.41338834, 437.87806995,
515.73930078, 552.7304126 , 490.83495041, 427.08431088,
488.47381561, 528.72554216, 460.08398055, 498.36809477],
[492.67674556, 452.85384834, 497.38365099, 499.61323777,
463.32051962, 369.98753966, 438.91893096, 482.39587318,
568.17303127, 608.92492299, 540.73672728, 470.50474369,
538.1355427 , 582.47954645, 506.85939481, 549.03574474],
[497.87117912, 457.62841757, 502.62771082, 504.88080478,
468.20544199, 373.8884254 , 443.54658032, 487.48191253,
574.16344403, 615.34499474, 546.43787114, 475.46541141,
543.80926152, 588.62079694, 512.20336014, 554.82438755],
[435.8435313 , 400.61444386, 440.00746699, 441.97985757,
409.87372189, 327.30726034, 388.28700268, 426.74861915,
502.63086815, 538.68178501, 478.3595062 , 416.22920268,
476.05838384, 515.28704109, 448.39012698, 485.70118227],
[413.47691149, 380.05571048, 417.42716231, 419.29833376,
388.83982085, 310.51050525, 368.36088897, 404.84873207,
476.83685557, 511.03771135, 453.81105145, 394.86915105,
451.6280181 , 488.84353898, 425.37964094, 460.77596737]]))
#Checking for null valuess
df.isna().sum()
MinTemp 637 MaxTemp 322 Rainfall 1406 Evaporation 60843 Sunshine 67816 WindGustDir 9330 WindGustSpeed 9270 WindDir9am 10013 WindDir3pm 3778 WindSpeed9am 1348 WindSpeed3pm 2630 Humidity9am 1774 Humidity3pm 3610 Pressure9am 14014 Pressure3pm 13981 Cloud9am 53657 Cloud3pm 57094 Temp9am 904 Temp3pm 2726 RainToday 1406 RainTomorrow 0 dtype: int64
df.columns
Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
'Temp3pm', 'RainToday', 'RainTomorrow'],
dtype='object')
#Creating a list to fill the Median for numerical columns
Numerical_cols = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
'Sunshine', 'WindGustSpeed','WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
'Temp3pm']
#Filling the values and making permanent change
for c in Numerical_cols:
p = df[c].median()
df[c] = df[c].fillna(p)
##Creating a list to fill the Mean for categorical columns
Categorical_cols = ["WindGustDir","WindDir9am","WindDir3pm","RainToday"]
#Filling the values and making permanent change
for x in Categorical_cols:
q = df[x].mode()[0]
df[x] = df[x].fillna(q)
df.dtypes
MinTemp float64 MaxTemp float64 Rainfall float64 Evaporation float64 Sunshine float64 WindGustDir object WindGustSpeed float64 WindDir9am object WindDir3pm object WindSpeed9am float64 WindSpeed3pm float64 Humidity9am float64 Humidity3pm float64 Pressure9am float64 Pressure3pm float64 Cloud9am float64 Cloud3pm float64 Temp9am float64 Temp3pm float64 RainToday object RainTomorrow object dtype: object
import warnings
warnings.filterwarnings("ignore")
#After EDA we having some categorical columns i.e., WindGustDir,WindDir9am,WindDir3pm,RainToday,RainTomorrow
#For Rain Tomorrow column we use LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["RainTomorrow"] = le.fit_transform(df[["RainTomorrow"]])
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["WindGustDir"] = le.fit_transform(df[["WindGustDir"]])
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["WindDir9am"] = le.fit_transform(df[["WindDir9am"]])
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["WindDir3pm"] = le.fit_transform(df[["WindDir3pm"]])
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["RainToday"] = le.fit_transform(df[["RainToday"]])
df
| MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | WindDir3pm | WindSpeed9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8.0 | 24.3 | 0.0 | 3.4 | 6.3 | 7 | 30.0 | 12 | 7 | 6.0 | ... | 68.0 | 29.0 | 1019.7 | 1015.0 | 7.0 | 7.0 | 14.4 | 23.6 | 0 | 1 |
| 1 | 14.0 | 26.9 | 3.6 | 4.4 | 9.7 | 1 | 39.0 | 0 | 13 | 4.0 | ... | 80.0 | 36.0 | 1012.4 | 1008.4 | 5.0 | 3.0 | 17.5 | 25.7 | 1 | 1 |
| 2 | 13.7 | 23.4 | 3.6 | 5.8 | 3.3 | 7 | 85.0 | 3 | 5 | 6.0 | ... | 82.0 | 69.0 | 1009.5 | 1007.2 | 8.0 | 7.0 | 15.4 | 20.2 | 1 | 1 |
| 3 | 13.3 | 15.5 | 39.8 | 7.2 | 9.1 | 7 | 54.0 | 14 | 13 | 30.0 | ... | 62.0 | 56.0 | 1005.5 | 1007.0 | 2.0 | 7.0 | 13.5 | 14.1 | 1 | 1 |
| 4 | 7.6 | 16.1 | 2.8 | 5.6 | 10.6 | 10 | 50.0 | 10 | 2 | 20.0 | ... | 68.0 | 49.0 | 1018.3 | 1018.5 | 7.0 | 7.0 | 11.1 | 15.4 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 142188 | -0.1 | 12.7 | 0.0 | 4.8 | 8.5 | 11 | 19.0 | 11 | 10 | 2.0 | ... | 92.0 | 54.0 | 1020.8 | 1018.5 | 8.0 | 8.0 | 4.0 | 10.7 | 0 | 0 |
| 142189 | 7.6 | 19.3 | 0.0 | 3.4 | 9.4 | 13 | 35.0 | 13 | 13 | 13.0 | ... | 73.0 | 32.0 | 1018.6 | 1015.4 | 1.0 | 1.0 | 9.4 | 18.8 | 0 | 0 |
| 142190 | 2.7 | 20.0 | 0.2 | 4.8 | 8.5 | 13 | 37.0 | 3 | 14 | 0.0 | ... | 88.0 | 35.0 | 1018.8 | 1015.2 | 5.0 | 5.0 | 9.6 | 19.4 | 0 | 0 |
| 142191 | 3.1 | 20.0 | 0.0 | 4.8 | 8.5 | 12 | 28.0 | 9 | 8 | 2.0 | ... | 95.0 | 45.0 | 1017.6 | 1015.2 | 5.0 | 5.0 | 7.8 | 17.8 | 0 | 0 |
| 142192 | 6.8 | 19.5 | 0.0 | 3.4 | 9.4 | 15 | 46.0 | 7 | 15 | 19.0 | ... | 67.0 | 42.0 | 1018.4 | 1015.2 | 1.0 | 1.0 | 11.9 | 17.9 | 0 | 0 |
142193 rows × 21 columns
X = df.drop("RainTomorrow",axis = 1)
Y = df["RainTomorrow"]
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.30)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X,Y)
lr.score(X,Y)
0.838318341971827
from sklearn.model_selection import cross_val_score
cv=cross_val_score(lr,X,Y)
print(cv)
print()
print(cv.mean())
[0.84310278 0.83835578 0.84292697 0.8320557 0.83497433] 0.8382831114316976
RocCurveDisplay.from_predictions(Y_test,lr_pred)
plt.show()
from sklearn.metrics import confusion_matrix,classification_report,f1_score,RocCurveDisplay
lr_pred = lr.predict(X_test)
print(confusion_matrix(Y_test,lr_pred))
[[31256 1859] [ 5058 4485]]
print(classification_report(Y_test,lr_pred))
precision recall f1-score support
0 0.86 0.94 0.90 33115
1 0.71 0.47 0.56 9543
accuracy 0.84 42658
macro avg 0.78 0.71 0.73 42658
weighted avg 0.83 0.84 0.83 42658
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=10)
dt.fit(X,Y)
dt.score(X,Y)
0.8600423368238943
cv1=cross_val_score(dt,X,Y)
print(cv1)
print()
print(cv1.mean())
[0.8371954 0.83431204 0.83902388 0.83328645 0.83170406] 0.8351043636590996
RocCurveDisplay.from_predictions(Y_test,dt_pred)
plt.show()
from sklearn.metrics import confusion_matrix,classification_report
dt_pred = dt.predict(X_test)
print(confusion_matrix(Y_test,dt_pred))
[[31620 1495] [ 4460 5083]]
print(classification_report(Y_test,dt_pred))
precision recall f1-score support
0 0.88 0.95 0.91 33115
1 0.77 0.53 0.63 9543
accuracy 0.86 42658
macro avg 0.82 0.74 0.77 42658
weighted avg 0.85 0.86 0.85 42658
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=10)
rfc.fit(X,Y)
rfc.score(X,Y)
0.8631718860984718
cv2=cross_val_score(rfc,X,Y)
print(cv2)
print()
print(cv2.mean())
[0.84988924 0.8483069 0.8508738 0.84028413 0.84123356] 0.8461175253538282
from sklearn.metrics import confusion_matrix,classification_report
rfc_pred = rfc.predict(X_test)
RocCurveDisplay.from_predictions(Y_test,rfc_pred)
plt.show()
print(confusion_matrix(Y_test,rfc_pred))
[[32136 979] [ 4850 4693]]
print(classification_report(Y_test,rfc_pred))
precision recall f1-score support
0 0.87 0.97 0.92 33115
1 0.83 0.49 0.62 9543
accuracy 0.86 42658
macro avg 0.85 0.73 0.77 42658
weighted avg 0.86 0.86 0.85 42658
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X,Y)
gbc.score(X,Y)
0.8495636212753089
cv3=cross_val_score(gbc,X,Y)
print(cv3)
print()
print(cv3.mean())
[0.84967826 0.84851788 0.85206934 0.84225332 0.84288628] 0.8470810164886384
from sklearn.metrics import confusion_matrix,classification_report
gbc_pred = gbc.predict(X_test)
RocCurveDisplay.from_predictions(Y_test,gbc_pred)
plt.show()
print(confusion_matrix(Y_test,gbc_pred))
[[31434 1681] [ 4724 4819]]
print(classification_report(Y_test,gbc_pred))
precision recall f1-score support
0 0.87 0.95 0.91 33115
1 0.74 0.50 0.60 9543
accuracy 0.85 42658
macro avg 0.81 0.73 0.75 42658
weighted avg 0.84 0.85 0.84 42658
from xgboost import XGBClassifier
xg = XGBClassifier()
xg.fit(X,Y)
xg.score(X,Y)
0.8796846539562426
xg_pred = xg.predict(X_test)
RocCurveDisplay.from_predictions(Y_test,xg_pred)
plt.show()